On centre sans réduire les données, puis calcul de la variance de X, ensuite la valeur propre de la matrice de variance.
# Data input
X <- read.table(text = "
math scie fran lati d-m
jean 6.0 6.0 5.0 5.5 8.0
aline 8.0 8.0 8.0 8.0 9.0
annie 6.0 7.0 11.0 9.5 11.0
monique 14.5 14.5 15.5 15.0 8.0
didier 14.0 14.0 12.0 12.5 10.0
andre 11.0 10.0 5.5 7.0 13.0
pierre 5.5 7.0 14.0 11.5 10.0
brigitte 13.0 12.5 8.5 9.5 12.0
evelyne 9.0 9.5 12.5 12.0 18.0
")
# create a table in pdf
knitr::kable(X, format = "latex", caption = "Tableau centré", digits = 2)
X
X <- scale(X, center = TRUE, scale = FALSE)
S <- var(X) # la matrice de variance
res.mypca <- eigen(S) # compute the eigenvalues and eigenvectors of S
Lambda <- res.mypca$values # the eigenvalues of S
Lambda
## [1] 31.78490603 13.58406368 9.69270028 0.02444871 0.01110353
U <- res.mypca$vectors # the eigenvector of S
C <- X%*%U
plot(C[,1:2], col = "red") # run plot together with text
text(C[,1:2], row.names(X))
# Compare with code
res.pca <- prcomp(X) # Principale Components Analysis
summary(res.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 5.6378 3.6857 3.1133 0.15636 0.1054
## Proportion of Variance 0.5769 0.2465 0.1759 0.00044 0.0002
## Cumulative Proportion 0.5769 0.8234 0.9993 0.99980 1.0000
lamb <- res.pca$sdev**2
lamb
## [1] 31.78490603 13.58406368 9.69270028 0.02444871 0.01110353
barplot(cumsum(lamb)/sum(lamb)) # we see last 3 variables are almost equal to 1,
# which means that only these 3 variables will give us almost 100% of their information
plot(res.pca) #give the variance of each variables
biplot(res.pca) # give the direction of 5 variables into 2 new PCs
abline(h=0, v=0)
biplot(res.pca, choices = c(2,3)) #PC2 = C2, PC1 = C2 where PC is Principale Component
abline(h=0, v=0)
library(MASS)
data(crabs)
crabsquant <- crabs[,4:8] # we don't want those 3 first variables qualitative
dim(crabsquant) # the dataset now consists of 200 crabs, with 5 variables quantitative
## [1] 200 5
let’s see the correlation matrix:
cor(crabsquant)
## FL RW CL CW BD
## FL 1.0000000 0.9069876 0.9788418 0.9649558 0.9876272
## RW 0.9069876 1.0000000 0.8927430 0.9004021 0.8892054
## CL 0.9788418 0.8927430 1.0000000 0.9950225 0.9832038
## CW 0.9649558 0.9004021 0.9950225 1.0000000 0.9678117
## BD 0.9876272 0.8892054 0.9832038 0.9678117 1.0000000
Those variables are extremely correlated.
# 1) Test a PCA crabsquant without any preliminary transformation
crabs.pca <- prcomp(scale(crabsquant))
summary(crabs.pca) # the proportion of variance of PC1 is: 98.25% and PC2 is: 0.906%
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 2.1883 0.38947 0.21595 0.10552 0.04137
## Proportion of Variance 0.9578 0.03034 0.00933 0.00223 0.00034
## Cumulative Proportion 0.9578 0.98810 0.99743 0.99966 1.00000
plot(crabs.pca) # the variance of each variables
biplot(crabs.pca) # the direction of 5 variables into PC1 and PC2
abline(h=0, v=0) # add the line h=0 and v=0
# 2) We use the library FactorMineR to improve the quality of PCA
library(FactoMineR)
sex.sp <- crabs$sex:crabs$sp # related sex vs species
X <- data.frame(crabsquant, sex.sp) # add the column sex.sp to the dataframe with crabsquant data
dim(X)
## [1] 200 6
res <- PCA(X, quali.sup = 6) # the 6th vector indicates the indexes of the categorical supplementary variables
plot.PCA(res, axes = c(2, 3),
col.ind.sup = c("blue1", "orange1", "blue4", "orange4")[sex.sp])
pairs(crabsquant,col=c("blue","orange")[crabs$sp],pch=c(20,21)[crabs$sex])
res<-princomp(scale(crabsquant))
plot(res$scores[,1:2],col=c("blue","orange")[crabs$sp],pch=c(20,21)[crabs$sex])
library(ggplot2)
respca <- prcomp(X[,1:5], scale. = TRUE)
respca
## Standard deviations (1, .., p=5):
## [1] 2.18834065 0.38946785 0.21594669 0.10552420 0.04137243
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4 PC5
## FL 0.4520437 0.1375813 0.53076841 0.696923372 0.09649156
## RW 0.4280774 -0.8981307 -0.01197915 -0.083703203 -0.05441759
## CL 0.4531910 0.2682381 -0.30968155 -0.001444633 -0.79168267
## CW 0.4511127 0.1805959 -0.65256956 0.089187816 0.57452672
## BD 0.4511336 0.2643219 0.44316103 -0.706636423 0.17574331
p <- ggplot(data = data.frame(respca$x), aes(x=PC2, y=PC3)) +
geom_point(colour = c("blue1", "orange1", "blue4", "orange4")[sex.sp]) +
geom_vline(xintercept = 0) + # vertical line
geom_hline(yintercept = 0) + # horizontal line
geom_density2d() # add the density in 2d (contour line)
p
ggsave(filename = "pca.pdf", p) # save file with name "pca.pdf", open in terminal by tapping p
## Saving 7 x 5 in image
Let’s us check that it’s a dissimilarity dataset if it follows the conditions below:
The matrix is symmetric
The diagonal of the matrix is equal to zeros
# 1) Load the data
Delta <- read.table(file = "neighbor_globin.txt", header = FALSE, row.names = 1)
Delta <- as.matrix(Delta) # Delta here is (Dij) as a matrix
dim(Delta) # Delta is a squared matrix with n=21
## [1] 21 21
head(Delta)
## V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## MYG_PHYCA 0.0000 0.1806 0.2434 0.3964 0.5656 0.4987 1.9654 2.1040 2.1278 2.0965
## MYG_HUMAN 0.1806 0.0000 0.1929 0.2997 0.4852 0.4271 1.9675 2.0689 2.2427 2.1483
## MYG_MOUSE 0.2434 0.1929 0.0000 0.3432 0.5312 0.4635 1.8727 2.1478 2.1478 2.1092
## MYG_CHICK 0.3964 0.2997 0.3432 0.0000 0.3657 0.3196 1.8520 2.0577 2.0649 1.8216
## MYG_ALLMI 0.5656 0.4852 0.5312 0.3657 0.0000 0.2970 1.8912 2.0551 2.0572 1.7896
## MYG_CHEMY 0.4987 0.4271 0.4635 0.3196 0.2970 0.0000 1.7142 1.9036 1.9751 1.6927
## V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## MYG_PHYCA 2.2725 2.0807 1.9645 1.9928 1.9195 2.0944 1.9867 1.9486 1.8515 1.9880
## MYG_HUMAN 2.2753 2.0387 2.0941 2.1273 1.9495 2.0628 2.1114 1.9951 1.9200 2.0044
## MYG_MOUSE 2.2318 1.9386 2.0581 2.0567 1.9920 2.1235 2.1776 2.0310 1.9519 2.0735
## MYG_CHICK 1.9345 2.0096 1.9935 2.0463 1.8520 1.9878 2.1320 1.9407 1.8823 2.0378
## MYG_ALLMI 1.9478 1.9237 1.7647 1.9622 1.9429 1.9423 2.0500 1.9352 1.9823 2.0511
## MYG_CHEMY 1.8907 1.8523 1.8770 1.8414 1.7849 1.8503 1.9604 1.9075 1.8643 1.7584
## V22
## MYG_PHYCA 2.6100
## MYG_HUMAN 2.5663
## MYG_MOUSE 2.6225
## MYG_CHICK 2.5424
## MYG_ALLMI 2.3154
## MYG_CHEMY 2.4536
# 2) Check the dissimilarities properties (check that these scores correspond well to dissimilarities)
diag(Delta) # is egal to 0 (which is good)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
sum(Delta - t(Delta)) # implies that Delta = t(Delta) <=> sum(Delta - t(Delta)) = 0
## [1] 0
# => this matrix is symmetric
# 3) Compute the matrix Delta of squared dissimilarities
Delta3 <- Delta%*%Delta
Delta2 <- Delta^2
# 4) Compute the centering matrix J : J = I - (1/n)1(n,n)
n <- ncol(Delta) # = nrow(Delta)
J <- diag(rep(1,n)) - (1/n)*matrix(1,n,n)
#J
# 5) Compute B = -1/2*J*Delta*J
B <- -(1/2)*J%*%Delta2%*%J
# B could be interpreted as a "pseudo" scal product
# 6 + 7) Perform the spectral decomposition of B
EigenB <- eigen(B)
Lambda <- EigenB$values
U <- EigenB$vectors # eigen vectors of B
# U%*%diag(Lambda)%*%t(U) = B, check
barplot(Lambda)
# We keep the first 3 largest eigenvalues which are positive.
print(paste("The eigen values which are negative: "))
## [1] "The eigen values which are negative: "
which(Lambda<0)
## [1] 13 14 15 16 17 18 19 20 21
#print(paste("The eigen values which are negative: ", which(Lambda<0)))
The 6 first are myoglobines. The 7 next are Hemoglobines beta. The 7 next are Hemoglobines alpha The last one is a Globine 3.
# 8 + 9)
C13 <- U[,1:3] # we consider only the first 3 columns
Lambda13 <- Lambda[1:3]
X <- C13%*%diag(Lambda13)
X
## [,1] [,2] [,3]
## [1,] -5.276621 0.28071344 -0.25823651
## [2,] -5.487849 0.23008442 -0.07782307
## [3,] -5.445457 -0.09589149 -0.17885550
## [4,] -5.055670 -0.25537567 -0.13507955
## [5,] -4.833841 -0.26835562 0.40520087
## [6,] -4.521246 -0.20279045 -0.14967938
## [7,] 1.760164 -1.10955307 -0.03572142
## [8,] 2.372571 -1.21231275 0.14725085
## [9,] 2.604147 -1.16686912 -0.25369313
## [10,] 1.968873 -1.29156699 -0.08037707
## [11,] 2.630732 -1.15512949 -0.03790172
## [12,] 2.092185 -0.99181637 0.12099230
## [13,] 1.801248 -1.10014691 0.18096727
## [14,] 2.164523 1.01342495 -0.65430466
## [15,] 1.756286 0.98218221 -0.54749710
## [16,] 2.064014 1.02604838 -0.88438899
## [17,] 2.449972 1.22905080 -0.06411919
## [18,] 2.041120 1.13724030 -0.15555672
## [19,] 1.832813 1.06800316 -0.30258986
## [20,] 1.885433 0.93428258 -0.61385487
## [21,] 1.196604 0.94877768 3.57526744
rownames(Delta)
## [1] "MYG_PHYCA" "MYG_HUMAN" "MYG_MOUSE" "MYG_CHICK" "MYG_ALLMI"
## [6] "MYG_CHEMY" "HBB_CHICK" "HBB_CHRPI" "HBB1_IGUIG" "HBB_PHYCA"
## [11] "HBB_HUMAN" "HBB1_MOUSE" "HBB_ALLMI" "HBA_CHICK" "HBA_CHRPI"
## [16] "HBA_ALLMI" "HBA_PHYCA" "HBA_HUMAN" "HBA_MOUSE" "HBA1_IGUIG"
## [21] "GLB3_MYXGL"
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.0 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::select() masks MASS::select()
X <- as_tibble(X)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
names(X) <- paste("PC", 1:3, sep = "") # name is required, otherwise error
ggplot(data = X, aes(x = PC1, y = PC2, label = row.names(Delta))) +
geom_point(col = "red") + geom_text(check_overlap = TRUE, col = "blue")
# 10) Use cmdscale function
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(FactoMineR)
library(tidyverse)
data("spam")
dim(spam)
## [1] 4601 58
spam %>% PCA(., quali.sup = ncol(spam)) %>%
plot(habillage = ncol(spam), choix = "ind")
res.pca = PCA(spam, quali.sup = ncol(spam))
plot(res.pca, habillage = ncol(spam), choix = "ind")
computeJ<-function(n){
diag(rep(1,n)) - 1/n * matrix(1,n,n)->J
}
print(computeJ(2))
## [,1] [,2]
## [1,] 0.5 -0.5
## [2,] -0.5 0.5
PCoA<-eigen(B)
plot(PCoA$values)
abline(h=0)
eigenval<-diag(PCoA$values)
eigenvect<-PCoA$vectors
q<-3
X<-eigenvect[,1:q]%*% sqrt(eigenval[1:q,1:q])
rownames(X)<-rownames(D)
pdf("PCoA.pdf")
plot(X[,1:2])
text(x=X[,1],y=X[,2],labels=rownames(X))
dev.off()
## quartz_off_screen
## 2
# Type of proteins
pch.type <- c(rep(1,6),rep(2,7), rep(3,7),4)
# Different colors for the species
colors <- c(1:6,4,7,8,1:3,5,4,7,5,1:3,8,9)
plot(X[,1:2],pch=pch.type,col=colors)
#text(x=X[,1],y=X[,2],labels=rownames(X),cex=0.5)
afp <- cmdscale(as.matrix(Delta), k=4, eig=TRUE)
pch.type <- c(rep(1,6),rep(2,7), rep(3,7),4)
colors <- c(1:6,4,7,8,1:3,5,4,7,5,1:3,8,9)
plot(afp$points,pch=pch.type,col = rainbow(9)[colors])
legend("topleft",legend=c("Myoglobin","Hemoglobin Beta","Hemoglobin Alpha","Globin-3"),pch=1:4)